env/lib/python3.9/site-packages/bleach/linkifier.py @ 0:4f3585e2f14b (draft, default, tip)

commit  "planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author  shellac
date    Mon, 22 Mar 2021 18:12:50 +0000
from __future__ import unicode_literals
import re
import six

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format(
            "|".join(sorted(protocols)), "|".join(sorted(tlds))
        ),
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )


URL_RE = build_url_re()


PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)


def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_email_re)

    """
    # open and closing braces doubled below for format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format(
            "|".join(tlds)
        ),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
    )


EMAIL_RE = build_email_re()


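# Illustrative sketch (added annotation, not part of the upstream module). The
# module-level regexes above can be exercised directly; the values shown are
# what one would expect from the patterns, not upstream-verified output:
#
#     >>> URL_RE.search("docs at https://example.com/page") is not None
#     True
#     >>> PROTO_RE.match("https://example.com") is not None
#     True
#     >>> EMAIL_RE.search("write to user@example.com").group(0)
#     'user@example.com'
#
# To widen matching, build new regexes rather than editing these in place,
# e.g. ``my_url_re = build_url_re(tlds=TLDS + ["onion"])`` (hypothetical TLD).

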
class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This function converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
            everything else gets escaped

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=recognized_tags,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            # linkify does not sanitize
            sanitize=False,
            # linkify alphabetizes attributes itself (via alphabetize_attributes),
            # so the serializer doesn't need to
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError("argument must be of text type")

        text = force_unicode(text)

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)


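# Illustrative usage sketch (added annotation, not part of the upstream
# module). ``Linker`` is the public entry point; ``bleach.linkify()`` wraps it.
# With the default ``nofollow`` callback, output along these lines would be
# expected (exact attribute quoting/order comes from BleachHTMLSerializer):
#
#     linker = Linker(parse_email=True)
#     linker.linkify("mail me at jane@example.com")
#     # e.g. 'mail me at <a href="mailto:jane@example.com">jane@example.com</a>'
#     linker.linkify("visit example.com")
#     # e.g. 'visit <a href="http://example.com" rel="nofollow">example.com</a>'

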
class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case
        of ``None``, we stop going through callbacks, return ``None``, and the
        link gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs

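    # Example callback shape (added annotation, not part of the upstream
    # module). A callback receives the attrs dict and the is_new flag and
    # returns an adjusted dict, or None to drop the link entirely:
    #
    #     def open_in_new_tab(attrs, is_new):
    #         attrs[(None, "target")] = "_blank"
    #         return attrs
    #
    # See bleach.callbacks.nofollow for the default callback.
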
    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token["type"]
            if token_type in ["Characters", "SpaceCharacters"]:
                out.append(token["data"])

        return "".join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % match.group(0),
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {"type": "Characters", "data": match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": force_unicode(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle commas
            if fragment.endswith(","):
                fragment = fragment[:-1]
                suffix = "," + suffix
                continue

            # Handle periods
            if fragment.endswith("."):
                fragment = fragment[:-1]
                suffix = "." + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix

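    # Worked example for strip_non_url_bits (added annotation; expected rather
    # than upstream-verified output): an over-matched fragment such as
    # "(http://example.com)." should be peeled from both ends, returning
    #
    #     ("http://example.com", "(", ").")
    #
    # so the prefix and suffix can be re-emitted as plain text around the
    # generated link.
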
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # True when already inside an "a" tag; happens when parse_email=True and an email was linkified
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {"type": "Characters", "data": prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({"type": "Characters", "data": prefix})

                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": force_unicode(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )

                        if suffix:
                            new_tokens.append({"type": "Characters", "data": suffix})

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token["data"]:
            attrs = a_token["data"]
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}

        else:
            new_text = attrs.pop("_text", "")
            a_token["data"] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {"type": "Characters", "data": force_unicode(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token["type"] in ["StartTag", "EmptyTag"]:
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token["name"]

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
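

# Minimal demo (annotation added to this copy; not part of the upstream
# module). A sketch of how the Linker front end might be exercised, assuming
# bleach and its dependencies are installed; run as ``python linkifier.py``.
if __name__ == "__main__":
    linker = Linker(parse_email=True)
    for sample in [
        "see http://example.com for details",
        "ping admin@example.com (or visit example.com).",
    ]:
        # Linker.linkify parses the fragment, runs LinkifyFilter over the
        # resulting token stream, and serializes the tokens back to unicode.
        print(linker.linkify(sample))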