sam_consensus_v3: env/lib/python3.9/site-packages/bleach/linkifier.py @ 0:4f3585e2f14b (draft, default, tip)
commit:   "planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author:   shellac
date:     Mon, 22 Mar 2021 18:12:50 +0000
parents:  (none)
children: (none)
from __future__ import unicode_literals
import re
import six

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()
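# Illustration (not from the upstream source): regex alternation tries branches
# left to right, so if a shorter TLD preceded a longer one that it prefixes,
# the shorter branch would win the match. Reversing the list keeps e.g. "com"
# ahead of "co". A minimal sketch of the behaviour being guarded against:
#
#     import re
#     re.match(r"co|com", "com").group(0)   # -> 'co'  (prefix wins)
#     re.match(r"com|co", "com").group(0)   # -> 'com' (longer TLD wins)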


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format(
            "|".join(sorted(protocols)), "|".join(sorted(tlds))
        ),
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )


URL_RE = build_url_re()
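# A minimal usage sketch (illustrative only; the tld and protocol lists are
# made up): build a narrower URL regex and pass it in instead of the
# module-level ``URL_RE``.
#
#     from bleach.linkifier import Linker, build_url_re
#
#     my_url_re = build_url_re(tlds=["com", "org"], protocols=["http", "https"])
#     linker = Linker(url_re=my_url_re)
#     linker.linkify("see example.com and example.xyz")
#     # with this regex only the .com domain gets turned into a link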


PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)


def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_email_re)

    """
    # open and closing braces doubled below for format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format(
            "|".join(tlds)
        ),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
    )


EMAIL_RE = build_email_re()
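# A minimal sketch of email linkification (illustrative; the address is made
# up): ``EMAIL_RE`` is only consulted when ``parse_email=True`` is passed to
# ``Linker`` or ``LinkifyFilter``.
#
#     from bleach.linkifier import Linker
#
#     linker = Linker(parse_email=True)
#     linker.linkify("contact jane@example.com")
#     # -> 'contact <a href="mailto:jane@example.com">jane@example.com</a>'
#     # (the default nofollow callback leaves mailto: links untouched)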


class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This class converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
            everything else gets escaped

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=recognized_tags,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            # linkify does not sanitize
            sanitize=False,
            # linkify alphabetizes
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError("argument must be of text type")

        text = force_unicode(text)

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
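# A usage sketch for the class above (illustrative; the input markup is made
# up): ``Linker`` parses the fragment, streams it through the
# ``LinkifyFilter`` defined below, and re-serializes the result.
#
#     from bleach.linkifier import Linker
#
#     linker = Linker(skip_tags=["pre"])
#     linker.linkify('go to http://example.com <pre>http://example.com</pre>')
#     # the first URL becomes <a href="http://example.com" rel="nofollow">...</a>;
#     # the copy inside <pre> is left untouched because of skip_tags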
class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs
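    # A sketch of a callback as consumed by ``apply_callbacks`` (illustrative;
    # ``target_blank`` here is a locally defined example, not an import): each
    # callback gets the attrs dict plus the is_new flag and returns an adjusted
    # dict, or ``None`` to drop the link entirely.
    #
    #     def target_blank(attrs, new=False):
    #         attrs[(None, "target")] = "_blank"
    #         return attrs
    #
    #     Linker(callbacks=[target_blank]).linkify("http://example.com")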
    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token["type"]
            if token_type in ["Characters", "SpaceCharacters"]:
                out.append(token["data"])

        return "".join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % match.group(0),
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {"type": "Characters", "data": match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": force_unicode(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
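    # Illustration of the rewrite performed above (comment only, token dicts
    # abbreviated): a single Characters token containing an email address is
    # split into the surrounding text, an "a" StartTag, the link text, and an
    # "a" EndTag. For example
    #
    #     {"type": "Characters", "data": "mail jane@example.com today"}
    #
    # comes out roughly as
    #
    #     {"type": "Characters", "data": "mail "}
    #     {"type": "StartTag", "name": "a",
    #      "data": {(None, "href"): "mailto:jane@example.com"}}
    #     {"type": "Characters", "data": "jane@example.com"}
    #     {"type": "EndTag", "name": "a"}
    #     {"type": "Characters", "data": " today"}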
    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle commas
            if fragment.endswith(","):
                fragment = fragment[:-1]
                suffix = "," + suffix
                continue

            # Handle periods
            if fragment.endswith("."):
                fragment = fragment[:-1]
                suffix = "." + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
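    # Worked example for the helper above (comment only): given the over-eager
    # match "(http://example.com),", the loop peels the leading parenthesis,
    # the trailing comma, and then the now-unbalanced closing parenthesis off
    # the fragment, so the caller can re-emit them as plain text around the
    # link:
    #
    #     self.strip_non_url_bits("(http://example.com),")
    #     # -> ("http://example.com", "(", "),")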
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # happens when parse_email=True and an email address was already linkified
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {"type": "Characters", "data": prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({"type": "Characters", "data": prefix})

                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": force_unicode(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )

                        if suffix:
                            new_tokens.append({"type": "Characters", "data": suffix})

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token["data"]:
            attrs = a_token["data"]
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}

        else:
            new_text = attrs.pop("_text", "")
            a_token["data"] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {"type": "Characters", "data": force_unicode(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token["type"] in ["StartTag", "EmptyTag"]:
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token["name"]

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
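# A sketch of using LinkifyFilter directly in an html5lib-style pipeline
# (illustrative; it simply mirrors what Linker.linkify does above using the
# same shim helpers):
#
#     from bleach import html5lib_shim
#     from bleach.linkifier import LinkifyFilter
#
#     parser = html5lib_shim.BleachHTMLParser(
#         tags=html5lib_shim.HTML_TAGS,
#         strip=False,
#         consume_entities=True,
#         namespaceHTMLElements=False,
#     )
#     walker = html5lib_shim.getTreeWalker("etree")
#     serializer = html5lib_shim.BleachHTMLSerializer(
#         quote_attr_values="always",
#         omit_optional_tags=False,
#         sanitize=False,
#         alphabetical_attributes=False,
#     )
#
#     dom = parser.parseFragment("mail me at jane@example.com")
#     filtered = LinkifyFilter(source=walker(dom), parse_email=True)
#     serializer.render(filtered)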