env/lib/python3.9/site-packages/bleach/sanitizer.py @ 0:4f3585e2f14b (draft, default, tip)

commit message: "planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author: shellac
date: Mon, 22 Mar 2021 18:12:50 +0000
from __future__ import unicode_literals

from itertools import chain
import re
import warnings

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of allowed tags
ALLOWED_TAGS = [
    "a",
    "abbr",
    "acronym",
    "b",
    "blockquote",
    "code",
    "em",
    "i",
    "li",
    "ol",
    "strong",
    "ul",
]


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ["http", "https", "mailto"]

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = "".join(
    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
)

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = "?"

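# A sketch (not part of the original module) of how the invisible-character
# replacement above behaves:
#
#     >>> INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, "a\x00b\x1fc")
#     'a?b?c'
#
# Tab (9), LF (10), and CR (13) are deliberately excluded and pass through:
#
#     >>> INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, "a\tb")
#     'a\tb'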

class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed for transforming content to be used in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!


    """

    def __init__(
        self,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        styles=ALLOWED_STYLES,
        protocols=ALLOWED_PROTOCOLS,
        strip=False,
        strip_comments=True,
        filters=None,
    ):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            escape_lt_in_attrs=True,
            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,
            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,
            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            message = (
                "argument cannot be of '{name}' type, must be of text type".format(
                    name=text.__class__.__name__
                )
            )
            raise TypeError(message)

        if not text:
            return ""

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),
            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,
            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)

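# A usage sketch (not part of the original module), mirroring the class
# docstring; the output shown is the expected result with the default allow
# lists, where disallowed tags are escaped rather than removed:
#
#     >>> cleaner = Cleaner()
#     >>> cleaner.clean('an <script>evil()</script> example')
#     'an &lt;script&gt;evil()&lt;/script&gt; example'
#
# With a custom tag list and strip=True, disallowed tags are dropped instead:
#
#     >>> Cleaner(tags=["b"], strip=True).clean('<b>bold</b> <i>italic</i>')
#     '<b>bold</b> italic'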

def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):

        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if "*" in attributes:
                attr_val = attributes["*"]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):

        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError("attributes needs to be a callable, a list or a dict")

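# A sketch (not part of the original module) of the three accepted shapes:
#
#     >>> f = attribute_filter_factory(["title"])              # list
#     >>> f("a", "title", "x"), f("a", "href", "x")
#     (True, False)
#     >>> f = attribute_filter_factory({"a": ["href"]})        # dict by tag
#     >>> f("a", "href", "x"), f("img", "src", "x")
#     (True, False)
#     >>> f = attribute_filter_factory(lambda tag, attr, value: attr == "alt")
#     >>> f("img", "alt", "x")                                 # callable
#     True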

class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        attributes=ALLOWED_ATTRIBUTES,
        strip_disallowed_elements=False,
        strip_html_comments=True,
        **kwargs
    ):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        # Suppress the html5lib sanitizer deprecation warning, which fires
        # when bleach instantiates this (html5lib-derived) filter
        warnings.filterwarnings(
            "ignore",
            message="html5lib's sanitizer is deprecated",
            category=DeprecationWarning,
            module="bleach._vendor.html5lib",
        )
        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token["type"] == "Characters":
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        "data": "".join(
                            [char_token["data"] for char_token in characters_buffer]
                        ),
                        "type": "Characters",
                    }
                    characters_buffer = []
                    yield new_token

            elif token["type"] == "Characters":
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            "data": "".join([char_token["data"] for char_token in characters_buffer]),
            "type": "Characters",
        }
        yield new_token

    def __iter__(self):
        return self.merge_characters(
            self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
        )

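    # A sketch (not part of the original module) of the merge step above, on
    # illustrative token dicts: feeding merge_characters the stream
    #
    #     {"type": "Characters", "data": "a"}, {"type": "Characters", "data": "b"}
    #
    # yields the single token {"type": "Characters", "data": "ab"}, so later
    # filters (e.g. LinkifyFilter) see text runs whole rather than fragmented.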
    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here the callable is a function that takes the tag, attribute name,
        and attribute value and returns True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token["type"]
        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
            if token["name"] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if "data" in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token["data"] = alphabetize_attributes(token["data"])
                return self.disallowed_token(token)

        elif token_type == "Comment":
            if not self.strip_html_comments:
                # use html5lib_shim.escape (xml.sax.saxutils-style) to escape
                # &, <, and > in addition to " and '
                token["data"] = html5lib_shim.escape(
                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
                )
                return token
            else:
                return None

        elif token_type == "Characters":
            return self.sanitize_characters(token)

        else:
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get("data", "")

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token["data"] = data

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        return new_tokens

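    # A sketch (not part of the original module) of the re-tokenizing above:
    # a Characters token with data "x &amp; y &lt; z" comes back as
    #
    #     [{"type": "Characters", "data": "x "},
    #      {"type": "Characters", "data": "&"},     # &amp; special-cased
    #      {"type": "Characters", "data": " y "},
    #      {"type": "Entity", "name": "lt"},
    #      {"type": "Characters", "data": " z"}]
    #
    # so the serializer re-escapes the bare "&" but leaves "&lt;" as-is.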
    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)

        # Remove REPLACEMENT characters
        new_value = new_value.replace("\ufffd", "")

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith("#"):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if "http" in allowed_protocols:
                return value

        return None

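    # A sketch (not part of the original module) of the vetting above, with
    # the default ALLOWED_PROTOCOLS of ["http", "https", "mailto"]:
    #
    #     sanitize_uri_value("https://example.com/", ALLOWED_PROTOCOLS) -> value
    #     sanitize_uri_value("#fragment", ALLOWED_PROTOCOLS)            -> value
    #     sanitize_uri_value("/relative/path", ALLOWED_PROTOCOLS)       -> value
    #     sanitize_uri_value("javascript:alert(1)", ALLOWED_PROTOCOLS)  -> None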
    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if "data" in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token["data"].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token["name"], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token["name"]) in self.svg_allow_local_href:
                    if namespaced_name in [
                        (None, "href"),
                        (html5lib_shim.namespaces["xlink"], "href"),
                    ]:
                        if re.search(r"^\s*[^#\s]", val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, "style"):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token["data"] = alphabetize_attributes(attrs)

        return token

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = "%s:%s" % (html5lib_shim.prefixes[ns], name)

                attrs.append(
                    ' %s="%s"'
                    % (
                        namespaced_name,
                        # NOTE(willkg): HTMLSerializer escapes attribute values
                        # already, so if we do it here (like HTMLSerializer does),
                        # then we end up double-escaping.
                        v,
                    )
                )
            token["data"] = "<%s%s>" % (token["name"], "".join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

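    # A sketch (not part of the original module): with strip_disallowed_elements
    # False, a disallowed {"type": "StartTag", "name": "script", "data": {}}
    # becomes {"type": "Characters", "data": "<script>"}, which the serializer
    # then escapes to "&lt;script&gt;" in the final output.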
    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(";")
        gauntlet = re.compile(
            r"""^(  # consider a style attribute value as composed of:
            [/:,#%!.\s\w]     # a non-newline character
            |\w-\w            # 3 characters in the form \w-\w
            |'[\s\w]+'\s*     # a single quoted string of [\s\w]+ with trailing space
            |"[\s\w]+"        # a double quoted string of [\s\w]+
            |\([\d,%\.\s]+\)  # a parenthesized string of one or more digits, commas, periods, ...
            )*$""",  # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
            flags=re.U | re.VERBOSE,
        )

        for part in parts:
            if not gauntlet.match(part):
                return ""

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ""

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ": " + value + ";")

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ": " + value + ";")

        return " ".join(clean)
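    # A sketch (not part of the original module), assuming this filter was
    # constructed with allowed_css_properties=["color"]:
    #
    #     sanitize_css("color: red; font-weight: bold")  -> "color: red;"
    #     sanitize_css("color: expression(alert(1))")    -> ""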